In [1]:
import ucdeconvolve as ucd
import scanpy as sc
from spatialdata_io import xenium
# old numpy 1.26.4 in python_edger ; 2.2.4 pip
import spatialdata as sd
/home/s4716765/.conda/envs/edger/lib/python3.10/site-packages/dask/dataframe/__init__.py:31: FutureWarning: The legacy Dask DataFrame implementation is deprecated and will be removed in a future version. Set the configuration option `dataframe.query-planning` to `True` or None to enable the new Dask Dataframe implementation and silence this warning. warnings.warn(
In [2]:
import spatialdata as sd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Polygon
import numpy as np
import pandas as pd
import os
from spatialdata_io import xenium
def plot_xenium_spatial(
xenium_sdata,
x_region,
y_region,
gene1_name,
gene2_name,
cell_type_colors,
gene1_threshold=0,
gene2_threshold=3,
save_dir="lnc_revision/xenium/",
save_filename="Xenium_spatial_plot.pdf",
figsize=(15, 10),
dpi=300
):
"""
Plot Xenium spatial data with cell boundaries colored by cell type and transcripts for two genes.
Parameters:
-----------
xenium_sdata : spatialdata.SpatialData
Loaded Xenium SpatialData object (user-defined).
x_region : tuple
(min, max) x-coordinates for region of interest (user-defined).
y_region : tuple
(min, max) y-coordinates for region of interest (user-defined).
gene1_name : str
Name of the first gene to plot transcripts for (user-defined).
gene2_name : str
Name of the second gene to plot transcripts for (user-defined).
cell_type_colors : dict
Dictionary mapping cell types to colors (user-defined).
gene1_threshold : int, optional
Minimum transcript count per cell for gene1 (default: 0).
gene2_threshold : int, optional
Minimum transcript count per cell for gene2 (default: 3).
save_dir : str, optional
Directory to save the plot (default: "lnc_revision/xenium/").
save_filename : str, optional
Filename for the saved plot (default: "Xenium_spatial_plot.pdf").
figsize : tuple, optional
Figure size (width, height) (default: (15, 10)).
dpi : int, optional
DPI for the plot (default: 300).
Returns:
--------
None
Displays and saves the plot, or raises an error if genes are not found.
Raises:
-------
ValueError
If gene1_name or gene2_name is not found in transcripts['feature_name'].
"""
# Use xenium_sdata.tables['table']
adata = xenium_sdata.tables['table']
print("Using xenium_sdata.tables['table'] as adata")
print("adata shape:", adata.shape)
# Debug: Check adata basics
print("adata.obs columns:", adata.obs.columns.tolist())
print("adata.var_names sample:", adata.var_names[:10].tolist())
# Check if predicted.id exists
if 'predicted.id' not in adata.obs.columns:
print("Warning: predicted.id not found in adata.obs. Using placeholder.")
adata.obs['predicted.id'] = 'nan'
else:
print("adata.obs['predicted.id'] unique values:", adata.obs['predicted.id'].unique())
# Ensure adata.obs index uses cell_id
if 'cell_id' in adata.obs.columns:
adata.obs.index = adata.obs['cell_id'].astype(str)
else:
print("Error: cell_id not found in adata.obs.columns")
raise KeyError("cell_id missing")
# Convert predicted.id to strings and handle np.nan
adata.obs['predicted.id'] = adata.obs['predicted.id'].astype(str).replace('nan', np.nan).fillna("nan")
# Check for missing cell types
missing_cell_types = [ct for ct in adata.obs['predicted.id'].unique() if ct not in cell_type_colors]
if missing_cell_types:
print("Warning: Missing cell types in cell_type_colors:", missing_cell_types)
for ct in missing_cell_types:
cell_type_colors[ct] = "grey" # Add grey as fallback for missing types
# Create color mapping
cell_types_to_keep = list(adata.obs["predicted.id"].unique())
subset_cell_type_colors = {key: cell_type_colors[key] for key in cell_types_to_keep if key in cell_type_colors}
subset_cell_type_colors = dict(sorted(subset_cell_type_colors.items()))
cell_type_to_color = subset_cell_type_colors
# Extract spatial coordinates and filter to region
if 'spatial' in adata.obsm:
spatial_coords = adata.obsm['spatial']
print("Spatial coords range: x", spatial_coords[:, 0].min(), spatial_coords[:, 0].max(),
"y", spatial_coords[:, 1].min(), spatial_coords[:, 1].max())
mask = (spatial_coords[:, 0] >= x_region[0]) & (spatial_coords[:, 0] <= x_region[1]) & \
(spatial_coords[:, 1] >= y_region[0]) & (spatial_coords[:, 1] <= y_region[1])
adata_subset = adata[mask].copy()
spatial_coords_subset = adata_subset.obsm['spatial']
else:
print("Error: adata.obsm['spatial'] not found.")
raise KeyError("spatial missing")
print("adata_subset shape:", adata_subset.shape)
print("adata_subset.obs index sample:", adata_subset.obs.index[:5].tolist())
# Extract cell boundaries
try:
cell_boundaries = xenium_sdata.shapes['cell_boundaries']
except KeyError:
print("Error: Cell boundaries not found in xenium_sdata.shapes['cell_boundaries'].")
raise
# Debug: Inspect cell boundaries
print("cell_boundaries shape:", cell_boundaries.shape)
print("cell_boundaries index sample:", cell_boundaries.index[:5].tolist())
print("cell_boundaries geometry types:", cell_boundaries['geometry'].geom_type.value_counts())
matching_cells = cell_boundaries.index.isin(adata_subset.obs.index)
print("Number of matching cell IDs:", matching_cells.sum())
# Filter boundaries
cell_boundaries_filtered = cell_boundaries.loc[cell_boundaries.index.isin(adata_subset.obs.index)]
print("Filtered cell_boundaries shape:", cell_boundaries_filtered.shape)
# Extract transcript data
try:
transcripts = xenium_sdata.points['transcripts'].compute()
except AttributeError:
transcripts = xenium_sdata.points['transcripts']
transcripts = transcripts.dropna()
# Debug: Check transcripts
print("Transcripts shape:", transcripts.shape)
print("Transcripts cell_id sample:", transcripts['cell_id'].head().tolist())
# Search for gene1_name and gene2_name
print(f"Checking for {gene1_name} and {gene2_name} in feature_name:")
gene1_matches = transcripts['feature_name'].str.contains(gene1_name, case=False, na=False).sum()
gene2_matches = transcripts['feature_name'].str.contains(gene2_name, case=False, na=False).sum()
print(f"{gene1_name} matches: {gene1_matches}")
print(f"{gene2_name} matches: {gene2_matches}")
if gene1_matches == 0:
raise ValueError(f"Gene '{gene1_name}' not found in transcripts['feature_name']. Check available genes.")
if gene2_matches == 0:
raise ValueError(f"Gene '{gene2_name}' not found in transcripts['feature_name']. Check available genes.")
# Filter transcripts to region
transcripts_cropped = transcripts[
(transcripts['x'] >= x_region[0]) & (transcripts['x'] <= x_region[1]) &
(transcripts['y'] >= y_region[0]) & (transcripts['y'] <= y_region[1])
]
print("transcripts_cropped shape:", transcripts_cropped.shape)
# Aggregate transcripts per cell
transcripts_per_cell = transcripts_cropped.groupby(['cell_id', 'feature_name']).size().unstack(fill_value=0)
transcripts_per_cell = transcripts_per_cell.reindex(adata_subset.obs.index, fill_value=0)
print("transcripts_per_cell shape:", transcripts_per_cell.shape)
print(f"Max {gene1_name} counts:", transcripts_per_cell.get(gene1_name, 0).max())
print(f"Max {gene2_name} counts:", transcripts_per_cell.get(gene2_name, 0).max())
# Filter cells
gene1_cells = transcripts_per_cell[transcripts_per_cell.get(gene1_name, 0) > gene1_threshold].index
gene2_cells = transcripts_per_cell[transcripts_per_cell.get(gene2_name, 0) > gene2_threshold].index
print(f"Cells with {gene1_name} > {gene1_threshold}:", len(gene1_cells))
print(f"Cells with {gene2_name} > {gene2_threshold}:", len(gene2_cells))
# Get transcripts
gene1 = transcripts_cropped[
(transcripts_cropped['feature_name'] == gene1_name) &
(transcripts_cropped['cell_id'].isin(gene1_cells))
]
gene2 = transcripts_cropped[
(transcripts_cropped['feature_name'] == gene2_name) &
(transcripts_cropped['cell_id'].isin(gene2_cells))
]
print(f"{gene1_name} transcripts:", len(gene1))
print(f"{gene2_name} transcripts:", len(gene2))
# Set up the plot
with plt.rc_context({"figure.figsize": (5, 4), "figure.dpi": dpi}):
fig, ax = plt.subplots(figsize=figsize)
# Plot boundaries
boundaries_plotted = False
if cell_boundaries_filtered.shape[0] > 0:
for cell_id in adata_subset.obs.index:
if cell_id in cell_boundaries_filtered.index:
try:
geom = cell_boundaries_filtered.loc[cell_id, 'geometry']
if geom.geom_type == 'Polygon':
coords = np.array(geom.exterior.coords)
elif geom.geom_type == 'MultiPolygon':
coords = np.array(max(geom.geoms, key=lambda x: x.area).exterior.coords)
else:
continue
cell_type = adata_subset.obs.loc[cell_id, 'predicted.id']
color = cell_type_to_color.get(cell_type, 'grey')
polygon = Polygon(coords, facecolor=color, edgecolor='none', alpha=1)
ax.add_patch(polygon)
boundaries_plotted = True
except Exception as e:
print(f"Failed to plot cell {cell_id}: {e}")
continue
else:
print("No matching boundaries, plotting centroids")
# Fallback to centroids
if not boundaries_plotted:
cell_colors = adata_subset.obs['predicted.id'].map(cell_type_to_color).to_numpy()
ax.scatter(
spatial_coords_subset[:, 0], spatial_coords_subset[:, 1],
c=cell_colors, s=10, alpha=1, label='Cells'
)
# Plot gene expression
if not gene1.empty:
ax.scatter(
gene1['x'], gene1['y'],
color='yellow', marker='o', s=3, zorder=10, label=f'{gene1_name} (>{gene1_threshold} transcripts)'
)
if not gene2.empty:
ax.scatter(
gene2['x'], gene2['y'],
color='black', marker='o', s=3, zorder=10, label=f'{gene2_name} (>{gene2_threshold} transcripts)'
)
# Set aspect ratio
ax.set_aspect('equal', adjustable='box')
# Add legends
ax.legend(loc='upper right')
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], marker='s', color='w', label=cell_type,
markerfacecolor=color, markersize=10, linestyle='None')
for cell_type, color in subset_cell_type_colors.items()
]
ax.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), title="Cell Types")
# Add title and labels
ax.set_title("Xenium Spatial Plot: Cell Types and Gene Expression")
ax.set_xlabel("X Coordinate")
ax.set_ylabel("Y Coordinate")
# Adjust layout
plt.tight_layout()
# Ensure save directory exists
os.makedirs(save_dir, exist_ok=True)
# Save plot
plt.savefig(os.path.join(save_dir, save_filename), bbox_inches="tight")
plt.show()
In [3]:
# ----------------------------
# Universal color scheme
# ----------------------------
cell_type_colors_Breast = {
"Cancer Epithelial Cells": "#FF0000", # Bright red (cancer)
"Epithelial Cells": "#FF6347", # Tomato
"Myoepithelial Cells": "#FFA07A", # Light Salmon
"Fibroblasts": "#228B22", # Forest Green
"Endothelial Cells": "#8B4513", #"#DC143C", # Crimson
"Perivascular-like (PVL) Cells": "#6A5ACD", # Slate Blue
"CD4+ T Cells": "#4169E1", # Royal Blue
"CD8+ T Cells": "#00CED1", # Dark Turquoise
"Regulatory T Cells": "#1E90FF", # Dodger Blue
"B Cells": "#DDA0DD", # Plum
"Plasma Cells": "#F1EA9D", # Pale Yellow
"NK Cells": "#32CD32", # Lime Green
"MDSCs": "#8B008B", # Dark Magenta
"Dendritic Cells": "#20B2AA", # Light Sea Green
"Monocytes": "#FFD700", # Gold
"Macrophages": "#FFDAB9", # Peach Puff
"Mast Cells": "#FF69B4", # Hot Pink
"nan": "#808080" # Grey
}
cell_type_colors_Melanoma = {
"Melanocytes": "#FF0000", # Bright red (cancer)
"KC Basal": "#FF4500", # Orange Red
"KC Granular": "#FF6347", # Tomato
"KC Differentiating": "#FFA500", # Orange
"KC Cornified": "#FFD700", # Gold
"KC Hair": "#DAA520", # Goldenrod
"Sweat gland related": "#98FB98", # Pale Green
"Fibroblast": "#228B22", # Forest Green
"Endothelial Cell": "#8B4513", # Crimson
"Pericytes": "#6A5ACD", # Slate Blue
"T Cell": "#4169E1", # Royal Blue
"NK": "#00CED1", # Dark Turquoise
"B Cell": "#DDA0DD", # Plum
"Plasma Cell": "#F1EA9D", # Pale Yellow
"Macrophage": "#FFDAB9", # Peach Puff
"DC": "#20B2AA", # Light Sea Green
"LC": "#00FF7F", # Spring Green
"Mast Cell": "#FF69B4", # Hot Pink
"nan": "#808080" # Grey
}
cell_type_colors_HN = {
"Epithelial": "#FF0000", # Bright red (cancer)
"CAFs": "#228B22", # Forest Green
"Myofib": "#32CD32", # Lime Green
"Endothelial": "#8B4513", # Crimson
"T cells": "#4169E1", # Royal Blue
"NK": "#00CED1", # Dark Turquoise
"B cells": "#DDA0DD", # Plum
"plasma cells": "#F1EA9D", # Pale Yellow
"TAMs": "#FFDAB9", # Peach Puff (macrophages/TAMs)
"mature DC": "#20B2AA", # Light Sea Green
"pDC": "#00FF7F", # Spring Green
"Mast cells": "#FF69B4", # Hot Pink
"Salivary": "#98FB98", # Pale Green
"nan": "#808080" # Grey
}
cell_type_colors_CRC = {
"Cancer_Epi": "#FF0000", # Bright red (cancer)
"Epithelial": "#FF6347", # Tomato
"Fibroblast 2": "#32CD32", # Lime Green
"Fibroblast 3": "#228B22", # Forest Green
"Endothelial/SM/Schwann": "#8B4513", # Crimson
"T CD4": "#4169E1", # Royal Blue
"T CD8": "#00CED1", # Dark Turquoise
"CD8+ CXCL13+ HSP+": "#00B7EB", # Sky Blue
"B cells": "#DDA0DD", # Plum
"Plasma cells": "#F1EA9D", # Pale Yellow
"Myeloid 1(Mono/Granulo)": "#FFD700", # Gold
"Myeloid 2 (Macrophages)": "#FFDAB9", # Peach Puff
"Myeloid 3 Dendritic": "#20B2AA", # Light Sea Green
"Mast cells": "#FF69B4", # Hot Pink
"nan": "#808080" # Grey
}
cell_type_colors_GBM = {
"GBM cell": "#FF0000", # Bright red (cancer)
"Neuron": "#4682B4", # Steel Blue
"Astrocyte": "#32CD32", # Lime Green
"Oligodendrocyte": "#DAA520", # Goldenrod
"Myeloid": "#FFD700", # Gold (microglia/macrophages)
"T cell": "#4169E1", # Royal Blue
"B cell": "#DDA0DD", # Plum (if present)
"NK cell": "#00CED1", # Dark Turquoise (if present)
"nan": "#808080" # Grey
}
Breast¶
In [ ]:
from spatialdata_io import xenium
# Load Xenium spatial data (replace with your actual path to the Xenium zarr or other format)
xenium_zarr_path = "/QRISdata/Q4386/Xenium_20250319__042556__20250319_lncRNA_IO_Cancer/output-XETG00114__0016985__BreastCancer__20250319__042625/" # Update this path
xenium_sdata = xenium(xenium_zarr_path)
# Load the metadata CSV file (replace with your actual path)
metadata_path = "lnc_revision/xenium/mel/clean-data/BreastCancer_LT_breast_ref_subsetted_ref.csv" # Update this path
metadata_df = pd.read_csv(metadata_path, index_col=0)
# Inspect the metadata to ensure it has the required columns
print("Metadata columns:", metadata_df.columns)
print("First few rows of metadata:\n", metadata_df.head())
# Inspect the xenium_sdata.table.obs to understand its structure
print("xenium_sdata.table.obs columns:", xenium_sdata.table.obs.columns)
print("First few rows of xenium_sdata.table.obs:\n", xenium_sdata.table.obs.head())
# Assume the metadata CSV has a 'cell_id' column and a 'cell_type' column
# Ensure the 'cell_id' column in metadata matches the index or a column in xenium_sdata.table.obs
# If cell_id is not the index in xenium_sdata.table.obs, identify the correct column (e.g., 'cell_id')
if 'cell_id' in xenium_sdata.table.obs.columns:
cell_id_col = 'cell_id'
else:
# If cell_id is the index of xenium_sdata.table.obs
cell_id_col = xenium_sdata.table.obs.index.name
xenium_sdata.table.obs[cell_id_col] = xenium_sdata.table.obs.index
# Merge the cell types from the metadata into xenium_sdata.table.obs
# Set the index of metadata_df to the cell_id column for merging
#metadata_df.set_index('cell_id', inplace=True)
# Ensure the cell_id column in xenium_sdata.table.obs is the index for proper joining
xenium_sdata.table.obs.set_index(cell_id_col, inplace=True)
# Join the cell_type column from metadata_df into xenium_sdata.table.obs
xenium_sdata.table.obs = xenium_sdata.table.obs.join(metadata_df['predicted.id'], how='left')
# Reset the index if needed (optional, depending on your downstream analysis)
xenium_sdata.table.obs.reset_index(inplace=True)
In [ ]:
#### Assuming xenium_sdata is already loaded
# xenium_sdata = sd.read_zarr("path_to_your_xenium_sdata.zarr")
cell_type_colors_Breast = {
"Cancer Epithelial Cells": "#ff4040",
"CD4+ T Cells": "#41baeb",
"B Cells": "#fed9b9",
"Plasma Cells": "#f1ea9d",
"NK Cells": "#99ca3e",
"CD8+ T Cells": "#406573",
"Fibroblasts": "#458b41",
"Epithelial Cells": "#d8a1a1",
"Myoepithelial Cells": "#a1d8a1",
"nan": "grey",
"Endothelial Cells": "#f8a41e",
"Perivascular-like (PVL) Cells": "#dca566",
"Mast Cells": "#7f8133",
"Macrophages": "#eae71d",
"Regulatory T Cells": "#bbe5f3",
"MDSCs": "#8b008b",
"Dendritic Cells": "#5f9d9e",
"Monocytes": "#9cc7a1",
}
In [24]:
plot_xenium_spatial(
xenium_sdata=xenium_sdata,
x_region=(0, 5000),
y_region=(6000, 8500),
gene1_name='BIRC5',
gene2_name='cuTAR215705',cell_type_colors=cell_type_colors_Breast,
save_filename="paper_Xenium_BIRC5_cuTAR215705_ERpos_boundaries_polygons.pdf"
)
Using xenium_sdata.tables['table'] as adata adata shape: (464673, 480) adata.obs columns: ['cell_id', 'transcript_counts', 'control_probe_counts', 'genomic_control_counts', 'control_codeword_counts', 'unassigned_codeword_counts', 'deprecated_codeword_counts', 'total_counts', 'cell_area', 'nucleus_area', 'nucleus_count', 'segmentation_method', 'region', 'z_level', 'cell_labels', 'predicted.id'] adata.var_names sample: ['A2M', 'ACE2', 'ACTA2', 'ACTB', 'ADAM28', 'AIF1', 'AIRE', 'AKT1', 'ALK', 'ANPEP'] adata.obs['predicted.id'] unique values: ['Cancer Epithelial Cells' 'CD4+ T Cells' 'Macrophages' 'Plasma Cells' 'Epithelial Cells' 'Fibroblasts' 'CD8+ T Cells' 'Endothelial Cells' 'B Cells' 'Regulatory T Cells' 'NK Cells' 'Myoepithelial Cells' 'Mast Cells' 'nan' 'Monocytes' 'Dendritic Cells' 'Perivascular-like (PVL) Cells' 'MDSCs'] Spatial coords range: x 6.468607425689697 8929.0244140625 y 4.90940523147583 9254.44140625 adata_subset shape: (108962, 480) adata_subset.obs index sample: ['aaakjklh-1', 'aaalfhfd-1', 'aaalfolm-1', 'aaalgdpe-1', 'aaalnebd-1'] cell_boundaries shape: (464673, 1) cell_boundaries index sample: ['aaaabalk-1', 'aaaaefcj-1', 'aaaaekcn-1', 'aaaaemhb-1', 'aaaaieck-1'] cell_boundaries geometry types: Polygon 464673 Name: count, dtype: int64 Number of matching cell IDs: 108962 Filtered cell_boundaries shape: (108962, 1) Transcripts shape: (72390350, 13) Transcripts cell_id sample: ['UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED', 'UNASSIGNED'] Checking for BIRC5 and cuTAR215705 in feature_name: BIRC5 matches: 23354 cuTAR215705 matches: 202753 transcripts_cropped shape: (15395766, 13)
/scratch/temp/16262444/ipykernel_1142355/1397492350.py:168: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. transcripts_per_cell = transcripts_cropped.groupby(['cell_id', 'feature_name']).size().unstack(fill_value=0)
transcripts_per_cell shape: (108962, 540) Max BIRC5 counts: 6 Max cuTAR215705 counts: 12 Cells with BIRC5 > 0: 2547 Cells with cuTAR215705 > 3: 2165 BIRC5 transcripts: 2924 cuTAR215705 transcripts: 10156